{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "logging.basicConfig(level=logging.DEBUG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# In a notebook `__name__` is the string '__main__', not a file path, so resolving it with\n",
    "# abspath() only worked by accident. The repository root is two levels above the working\n",
    "# directory, so walk up from os.getcwd() instead (same resulting path).\n",
    "SOURCE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))\n",
    "\n",
    "# Insert at position 0 so the local checkout is searched before anything else on sys.path.\n",
    "sys.path.insert(0, SOURCE_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:tensorflow:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.\n",
      "DEBUG:h5py._conv:Creating converter from 7 to 5\n",
      "DEBUG:h5py._conv:Creating converter from 5 to 7\n",
      "DEBUG:h5py._conv:Creating converter from 7 to 5\n",
      "DEBUG:h5py._conv:Creating converter from 5 to 7\n",
      "DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/husein/dev/malaya/misc/normalize-twitter, universal_newlines=False, shell=None, istream=None)\n",
      "DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/husein/dev/malaya/misc/normalize-twitter, universal_newlines=False, shell=None, istream=None)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.49 s, sys: 3.23 s, total: 6.73 s\n",
      "Wall time: 2.61 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<module 'malaya' from '/home/husein/dev/malaya/malaya/__init__.py'>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import malaya\n",
    "malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /huseinzol05/language-model-bahasa-wiki-news/resolve/main/model.klm HTTP/1.1\" 302 0\n"
     ]
    }
   ],
   "source": [
    "lm = malaya.language_model.kenlm(model = 'bahasa-wiki-news')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /huseinzol05/stem-v2-lstm-bahdanau-noisy/resolve/main/model.pb HTTP/1.1\" 302 0\n",
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /huseinzol05/bpe/resolve/main/stemmer-v2.yttm HTTP/1.1\" 200 0\n",
      "DEBUG:malaya_boilerplate.frozen_graph:use_tensorrt: False\n",
      "DEBUG:malaya_boilerplate.frozen_graph:tensorrt_precision_mode: FP32\n",
      "DEBUG:malaya_boilerplate.frozen_graph:precision_mode: FP32\n",
      "DEBUG:malaya_boilerplate.frozen_graph:t5_graph: False\n",
      "DEBUG:malaya_boilerplate.frozen_graph:glowtts_graph: False\n",
      "DEBUG:malaya_boilerplate.frozen_graph:glowtts_multispeaker_graph: False\n",
      "2022-12-09 15:25:18.532818: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2022-12-09 15:25:18.537438: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
      "2022-12-09 15:25:18.537459: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31\n",
      "2022-12-09 15:25:18.537463: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31\n",
      "2022-12-09 15:25:18.537525: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n",
      "2022-12-09 15:25:18.537547: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.161.3\n"
     ]
    }
   ],
   "source": [
    "stemmer = malaya.stem.deep_model(model = 'noisy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:malaya_boilerplate.huggingface:downloading frozen huseinzol05/v27-preprocessing/bm_1grams.json\n",
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /huseinzol05/v27-preprocessing/resolve/main/bm_1grams.json HTTP/1.1\" 200 0\n"
     ]
    }
   ],
   "source": [
    "# Probability-based spelling corrector built on the `lm` and `stemmer` objects created in the\n",
    "# previous cells.\n",
    "corrector = malaya.spelling_correction.probability.load(\n",
    "    language_model=lm,\n",
    "    replace_augmentation=True,\n",
    "    stemmer=stemmer,\n",
    "    maxlen=12,\n",
    "    minlen=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "normalizer = malaya.normalize.normalizer(corrector, stemmer, date = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# normalizer._tokenizer(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased/resolve/main/tokenizer_config.json HTTP/1.1\" 200 0\n",
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased/resolve/main/config.json HTTP/1.1\" 200 0\n"
     ]
    }
   ],
   "source": [
    "segmenter = malaya.segmentation.huggingface()\n",
    "\n",
    "\n",
    "def segmenter_func(x, max_length=256):\n",
    "    \"\"\"Run `segmenter` on the single string `x` and return the first generated result.\n",
    "\n",
    "    `max_length` is forwarded to `segmenter.generate`; the default of 256 is the value\n",
    "    that used to be hard-coded here, so existing one-argument calls behave the same.\n",
    "    \"\"\"\n",
    "    return segmenter.generate([x], max_length=max_length)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_scorer(x):\n",
    "    \"\"\"Return `lm.score(x)`; passed to `normalizer.normalize` as its `text_scorer`.\"\"\"\n",
    "    return lm.score(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = '14:14hrs Kerja penyelenggaraan KM 83.5- KM 84.5 arah Utara dari Ayer Hitam ke Yong Peng. Lorong kiri ditutup. Trafi https://t.co/iP2jEdY4Ib'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = 'Tok aza.....semoga tok aza dapat jaga undang2 mcm mana tok aza jaga dewan masa jd speaker....hara\\npan tgi pada tok aza skrg ni'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:malaya.torch_model.huggingface:generate, initial_text: segmentasi: \n",
      "DEBUG:malaya.torch_model.huggingface:generate, strings: ['Tok aza.....semoga tok aza dapat jaga undang2 mcm mana tok aza jaga dewan masa jd speaker....hara\\npan tgi pada tok aza skrg ni']\n",
      "You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['Tok aza.....semoga tok aza dapat jaga undang2 mcm mana tok aza jaga dewan masa jd speaker....hara pan tgi pada tok aza skrg ni']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "segmenter.generate([t], max_length = 512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased/resolve/main/tokenizer_config.json HTTP/1.1\" 200 0\n",
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n",
      "DEBUG:urllib3.connectionpool:https://huggingface.co:443 \"HEAD /mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased/resolve/main/config.json HTTP/1.1\" 200 0\n"
     ]
    }
   ],
   "source": [
    "true_case = malaya.true_case.huggingface()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = 'JOHOR BAHRU, 27 April- 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from malaya.text.regex import (\n",
    "    _past_date_string,\n",
    "    _now_date_string,\n",
    "    _future_date_string,\n",
    "    _yesterday_tomorrow_date_string,\n",
    "    _depan_date_string,\n",
    "    _expressions,\n",
    "    _left_datetime,\n",
    "    _right_datetime,\n",
    "    _today_time,\n",
    "    _left_datetodaytime,\n",
    "    _right_datetodaytime,\n",
    "    _left_yesterdaydatetime,\n",
    "    _right_yesterdaydatetime,\n",
    "    _left_yesterdaydatetodaytime,\n",
    "    _right_yesterdaydatetodaytime,\n",
    ")\n",
    "from malaya.text.normalization import (\n",
    "    _remove_postfix,\n",
    "    _normalize_title,\n",
    "    _is_number_regex,\n",
    "    _string_to_num,\n",
    "    _replace_compound,\n",
    "    cardinal,\n",
    "    digit_unit,\n",
    "    rom_to_int,\n",
    "    ordinal,\n",
    "    fraction,\n",
    "    money,\n",
    "    ignore_words,\n",
    "    digit,\n",
    "    unpack_english_contractions,\n",
    "    repeat_word,\n",
    "    replace_laugh,\n",
    "    replace_mengeluh,\n",
    "    replace_betul,\n",
    "    digits,\n",
    "    normalize_numbers_with_shortform,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word = '903,000'\n",
    "word_lower = word.lower()\n",
    "\n",
    "# Keys of `_expressions` whose patterns must all fail to match, checked in this order.\n",
    "excluded_kinds = ('email', 'url', 'hashtag', 'phone', 'money', 'date', 'ic', 'user', 'number')\n",
    "\n",
    "# Truthy only when `word` contains a doubled character, does not start with an uppercase\n",
    "# letter or with 'ke-', matches none of the excluded patterns, and fails `_is_number_regex`.\n",
    "(\n",
    "    len(re.findall(r'(.)\\1{1}', word))\n",
    "    and not word[0].isupper()\n",
    "    and not word_lower.startswith('ke-')\n",
    "    and not any(re.findall(_expressions[kind], word) for kind in excluded_kinds)\n",
    "    and not _is_number_regex(word)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# t_ = true_case.generate([t], max_length = 512)[0]\n",
    "# t_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:malaya.normalize:caching malaya.preprocessing.demoji inside normalizer\n",
      "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): raw.githubusercontent.com:443\n",
      "DEBUG:urllib3.connectionpool:https://raw.githubusercontent.com:443 \"GET /huseinzol05/malay-dataset/master/dictionary/emoji/demoji.json HTTP/1.1\" 200 52544\n",
      "DEBUG:malaya.normalize:before expand_contractions: JOHOR BAHRU, 27 April- 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93\n",
      "DEBUG:malaya.normalize:after expand_contractions: JOHOR BAHRU, 27 April- 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93\n",
      "DEBUG:malaya.normalize:tokenized: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras', 'Bangsa', 'Johor', 'yang', 'mula', 'diedar', 'https://t.co/IBGR0fYM93']\n",
      "DEBUG:malaya.normalize:before normalize_elongated: JOHOR BAHRU , 27 April - 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93\n",
      "DEBUG:malaya.normalize:after normalize_elongated: JOHOR BAHRU , 27 April - 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93\n",
      "DEBUG:malaya.normalize:before normalize_text: JOHOR BAHRU , 27 April - 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93\n",
      "DEBUG:malaya.normalize:after normalize_text: JOHOR BAHRU , 27 April - 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93\n",
      "DEBUG:malaya.normalize:index: 0, word: JOHOR, queue: []\n",
      "DEBUG:malaya.normalize:index: 0, word: JOHOR, score: 0.00013451914133897459, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 0, word: JOHOR, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 0, word: JOHOR, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 0, word: JOHOR, lower: johor BAHRU , , normal: JOHOR BAHRU , , lower_score: -19.637670516967773, title_score: -17.512680053710938, normal_score: -11.153648376464844, upper_score: -11.153648376464844\n",
      "DEBUG:malaya.normalize:index: 0, word: JOHOR, condition titled\n",
      "DEBUG:malaya.normalize:index: 1, word: BAHRU, queue: ['JOHOR']\n",
      "DEBUG:malaya.normalize:index: 1, word: BAHRU, score: 5.916739980035546e-05, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 1, word: BAHRU, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 1, word: BAHRU, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 1, word: BAHRU, lower: JOHOR bahru , 27 , normal: JOHOR BAHRU , 27 , lower_score: -22.196605682373047, title_score: -20.8834228515625, normal_score: -15.063736915588379, upper_score: -15.063736915588379\n",
      "DEBUG:malaya.normalize:index: 1, word: BAHRU, condition titled\n",
      "DEBUG:malaya.normalize:index: 2, word: ,, queue: ['JOHOR', 'BAHRU']\n",
      "DEBUG:malaya.normalize:index: 2, word: ,, condition punct\n",
      "DEBUG:malaya.normalize:index: 3, word: 27, queue: ['JOHOR', 'BAHRU', ',']\n",
      "DEBUG:malaya.normalize:index: 3, word: 27, score: 0.0011754454990524658, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 3, word: 27, condition check english\n",
      "DEBUG:malaya.normalize:index: 3, word: 27, condition check malay\n",
      "DEBUG:malaya.normalize:index: 3, word: 27, condition else for (word[0] == `x` and len(word) > 1 and norm text)\n",
      "DEBUG:malaya.normalize:index: 3, word: 27, condition is number\n",
      "DEBUG:malaya.normalize:index: 4, word: April, queue: ['JOHOR', 'BAHRU', ',', '27']\n",
      "DEBUG:malaya.normalize:index: 4, word: April, score: 0.00032630516156981773, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 4, word: April, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 4, word: April, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 4, word: April, lower: , 27 april - 903,000 , normal: , 27 April - 903,000 , lower_score: -26.24266242980957, title_score: -21.07174301147461, normal_score: -21.07174301147461, upper_score: -26.630231857299805\n",
      "DEBUG:malaya.normalize:index: 4, word: April, condition titled\n",
      "DEBUG:malaya.normalize:index: 5, word: -, queue: ['JOHOR', 'BAHRU', ',', '27', 'April']\n",
      "DEBUG:malaya.normalize:index: 5, word: -, score: 0.004704706470857255, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 5, word: -, condition else for (word[0] == `x` and len(word) > 1 and norm text)\n",
      "DEBUG:malaya.normalize:index: 5, word: -, end_result_string: , repeat: 1\n",
      "DEBUG:malaya.normalize:index: 5, word: -, condition normalize text\n",
      "DEBUG:malaya.normalize:index: 5, word: -, condition to spelling correction\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-']\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, score: 3.782883746097449e-05, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, condition check english\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, condition check malay\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, condition len(word) > 2 and norm text\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, condition else for (word[0] == `x` and len(word) > 1 and norm text)\n",
      "DEBUG:malaya.normalize:index: 6, word: 903,000, condition is number\n",
      "DEBUG:malaya.normalize:index: 7, word: ketua, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000']\n",
      "DEBUG:malaya.normalize:index: 7, word: ketua, score: 0.0008012150421608712, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 7, word: ketua, condition check english\n",
      "DEBUG:malaya.normalize:index: 7, word: ketua, condition check malay\n",
      "DEBUG:malaya.normalize:index: 8, word: isi, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua']\n",
      "DEBUG:malaya.normalize:index: 8, word: isi, score: 0.0006422582131468719, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 8, word: isi, condition check english\n",
      "DEBUG:malaya.normalize:index: 8, word: isi, condition check malay\n",
      "DEBUG:malaya.normalize:index: 9, word: rumah, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi']\n",
      "DEBUG:malaya.normalize:index: 9, word: rumah, score: 0.0017524161324617904, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 9, word: rumah, condition check english\n",
      "DEBUG:malaya.normalize:index: 9, word: rumah, condition check malay\n",
      "DEBUG:malaya.normalize:index: 10, word: di, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah']\n",
      "DEBUG:malaya.normalize:index: 10, word: di, score: 0.002181985396337787, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 10, word: di, condition check english\n",
      "DEBUG:malaya.normalize:index: 10, word: di, condition check malay\n",
      "DEBUG:malaya.normalize:index: 11, word: Johor, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di']\n",
      "DEBUG:malaya.normalize:index: 11, word: Johor, score: 0.00013451914133897459, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 11, word: Johor, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 11, word: Johor, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 11, word: Johor, lower: rumah di johor bakal menerima , normal: rumah di Johor bakal menerima , lower_score: -20.578521728515625, title_score: -15.515691757202148, normal_score: -15.515691757202148, upper_score: -21.205562591552734\n",
      "DEBUG:malaya.normalize:index: 11, word: Johor, condition titled\n",
      "DEBUG:malaya.normalize:index: 12, word: bakal, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor']\n",
      "DEBUG:malaya.normalize:index: 12, word: bakal, score: 0.0008419241482997328, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 12, word: bakal, condition check english\n",
      "DEBUG:malaya.normalize:index: 13, word: menerima, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal']\n",
      "DEBUG:malaya.normalize:index: 13, word: menerima, score: 0.000954248890481002, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 13, word: menerima, condition check english\n",
      "DEBUG:malaya.normalize:index: 13, word: menerima, condition check malay\n",
      "DEBUG:malaya.normalize:index: 14, word: Bantuan, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:malaya.normalize:index: 14, word: Bantuan, score: 0.0007869930162521621, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 14, word: Bantuan, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 14, word: Bantuan, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 14, word: Bantuan, lower: bakal menerima bantuan Beras Bangsa , normal: bakal menerima Bantuan Beras Bangsa , lower_score: -20.368301391601562, title_score: -19.99850082397461, normal_score: -19.99850082397461, upper_score: -25.98023223876953\n",
      "DEBUG:malaya.normalize:index: 14, word: Bantuan, condition titled\n",
      "DEBUG:malaya.normalize:index: 15, word: Beras, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan']\n",
      "DEBUG:malaya.normalize:index: 15, word: Beras, score: 0.0036940168338038225, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 15, word: Beras, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 15, word: Beras, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 15, word: Beras, lower: menerima Bantuan beras Bangsa Johor , normal: menerima Bantuan Beras Bangsa Johor , lower_score: -21.528759002685547, title_score: -19.1713924407959, normal_score: -19.1713924407959, upper_score: -23.87087631225586\n",
      "DEBUG:malaya.normalize:index: 15, word: Beras, condition titled\n",
      "DEBUG:malaya.normalize:index: 16, word: Bangsa, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras']\n",
      "DEBUG:malaya.normalize:index: 16, word: Bangsa, score: 0.0033790175139567025, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 16, word: Bangsa, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 16, word: Bangsa, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 16, word: Bangsa, lower: Bantuan Beras bangsa Johor yang , normal: Bantuan Beras Bangsa Johor yang , lower_score: -18.67375946044922, title_score: -17.220781326293945, normal_score: -17.220781326293945, upper_score: -21.91854476928711\n",
      "DEBUG:malaya.normalize:index: 16, word: Bangsa, condition titled\n",
      "DEBUG:malaya.normalize:index: 17, word: Johor, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras', 'Bangsa']\n",
      "DEBUG:malaya.normalize:index: 17, word: Johor, score: 0.00013451914133897459, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 17, word: Johor, condition not in money and date\n",
      "DEBUG:malaya.normalize:index: 17, word: Johor, condition text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 17, word: Johor, lower: Beras Bangsa johor yang mula , normal: Beras Bangsa Johor yang mula , lower_score: -24.01392936706543, title_score: -18.37645721435547, normal_score: -18.37645721435547, upper_score: -23.851085662841797\n",
      "DEBUG:malaya.normalize:index: 17, word: Johor, condition titled\n",
      "DEBUG:malaya.normalize:index: 18, word: yang, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras', 'Bangsa', 'Johor']\n",
      "DEBUG:malaya.normalize:index: 18, word: yang, score: 0.0018127209188164078, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 18, word: yang, condition check english\n",
      "DEBUG:malaya.normalize:index: 19, word: mula, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras', 'Bangsa', 'Johor', 'yang']\n",
      "DEBUG:malaya.normalize:index: 19, word: mula, score: 0.0009588931829133253, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 19, word: mula, condition check english\n",
      "DEBUG:malaya.normalize:index: 19, word: mula, condition check malay\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras', 'Bangsa', 'Johor', 'yang', 'mula']\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, score: 0.00015728680851909756, text_scorer is not None\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, condition check english\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, condition check malay\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, condition len(word) > 2 and norm text\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, condition else for (word[0] == `x` and len(word) > 1 and norm text)\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, end_result_string: , repeat: 1\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, condition normalize text\n",
      "DEBUG:malaya.normalize:index: 20, word: diedar, condition to spelling correction\n",
      "DEBUG:malaya.normalize:index: 21, word: https://t.co/IBGR0fYM93, queue: ['JOHOR', 'BAHRU', ',', '27', 'April', '-', '903,000', 'ketua', 'isi', 'rumah', 'di', 'Johor', 'bakal', 'menerima', 'Bantuan', 'Beras', 'Bangsa', 'Johor', 'yang', 'mula', 'diedar']\n",
      "DEBUG:malaya.normalize:index: 21, word: https://t.co/IBGR0fYM93, condition url\n",
      "DEBUG:malaya.normalize:spelling correction, index: 5, selected: -\n",
      "DEBUG:malaya.spelling_correction.probability:word: -, string: [',', '27', 'April', '-', '903,000', 'ketua', 'isi'], index: 5, lookback: 3, lookforward: 3, score: -28.970577239990234\n",
      "DEBUG:malaya.spelling_correction.probability:word: -, string: [',', '27', 'April', '-', '903,000', 'ketua', 'isi'], index: 5, lookback: 3, lookforward: 3, score: -28.970577239990234\n",
      "DEBUG:malaya.normalize:spelling correction, index: 20, selected: diedar\n",
      "DEBUG:malaya.normalize:past_date_string_: []\n",
      "DEBUG:malaya.normalize:now_date_string_: []\n",
      "DEBUG:malaya.normalize:future_date_string_: []\n",
      "DEBUG:malaya.normalize:yesterday_date_string_: []\n",
      "DEBUG:malaya.normalize:depan_date_string_: []\n",
      "DEBUG:malaya.normalize:today_time_: []\n",
      "DEBUG:malaya.normalize:time_: []\n",
      "DEBUG:malaya.normalize:left_datetime_: []\n",
      "DEBUG:malaya.normalize:right_datetime_: []\n",
      "DEBUG:malaya.normalize:today_left_datetime_: []\n",
      "DEBUG:malaya.normalize:today_right_datetime_: []\n",
      "DEBUG:malaya.normalize:left_yesterdaydatetime_: []\n",
      "DEBUG:malaya.normalize:right_yesterdaydatetime_: []\n",
      "DEBUG:malaya.normalize:left_yesterdaydatetodaytime_: []\n",
      "DEBUG:malaya.normalize:right_yesterdaydatetodaytime_: []\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'JOHOR BAHRU , 27 April - 903,000 ketua isi rumah di Johor bakal menerima Bantuan Beras Bangsa Johor yang mula diedar https://t.co/IBGR0fYM93'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normalized = normalizer.normalize(\n",
    "            t, normalize_hingga = False, normalize_cardinal = False,\n",
    "            normalize_ordinal = False, normalize_pada_hari_bulan = False,\n",
    "            normalize_fraction = False, normalize_money = False, normalize_date = False,\n",
    "            normalize_time = False, normalize_ic = False,\n",
    "            normalize_units = False,\n",
    "            normalize_url = False, normalize_percent=False, normalize_telephone = False,\n",
    "            text_scorer = text_scorer, normalize_elongated = True,\n",
    "            not_a_word_threshold = 1e-9,\n",
    "        )['normalize']\n",
    "normalized"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "segmenter_func('14hrs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from malaya.text.tatabahasa import (\n",
    "    date_replace,\n",
    "    consonants,\n",
    "    sounds,\n",
    "    bulan,\n",
    ")\n",
    "from malaya.text.function import (\n",
    "    is_laugh,\n",
    "    is_mengeluh,\n",
    "    multireplace,\n",
    "    case_of,\n",
    "    PUNCTUATION,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'14:14hrs'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word = '14:14hrs'\n",
    "word = multireplace(word, date_replace)\n",
    "word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "datetime.datetime(2022, 12, 3, 1, 35, 37, 152859)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dateparser.parse('14:14 hours', settings={'TIMEZONE': 'GMT+8'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['14:14hrs',\n",
       " 'Kerja',\n",
       " 'penyelenggaraan',\n",
       " 'KM',\n",
       " '83.5',\n",
       " '-',\n",
       " 'KM',\n",
       " '84.5',\n",
       " 'arah',\n",
       " 'Utara',\n",
       " 'dari',\n",
       " 'Ayer',\n",
       " 'Hitam',\n",
       " 'ke',\n",
       " 'Yong',\n",
       " 'Peng',\n",
       " '.',\n",
       " 'Lorong',\n",
       " 'kiri',\n",
       " 'ditutup',\n",
       " '.',\n",
       " 'Trafi',\n",
       " 'https://t.co/iP2jEdY4Ib']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normalizer._tokenizer(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'14:14hrs Kerja penyelenggaraan KM 83.5- KM 84.5 arah Utara dari Ayer Hitam ke Yong Peng. Lorong kiri ditutup. Trafi https://t.co/iP2jEdY4Ib'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
